"""
process_aqua_modis_l3.py
------------------------
Processes raw NASA Aqua MODIS Level-3 Mapped (L3m) NetCDF files downloaded
from NASA EarthData into analysis-ready CSV files clipped to the Louisiana
coastal bounding box.

This script handles the nasa_data/ folder containing the original downloaded
.nc granules and converts each to a CSV preserving NaN values for cloud-
masked pixels. It is the second step in the pipeline after download_ocean_nasa.py
for cases where raw .nc files are available locally.

Pipeline position:
    download_ocean_nasa.py  -->  [this script]  -->  analyze_all_years.py

Output: one CSV per input .nc file written to processed_data/

Usage:
    python process_aqua_modis_l3.py
"""

import os
import glob
import numpy as np
import pandas as pd
import xarray as xr


# --- Input / Output ---
INPUT_DIR  = "nasa_data"       # Raw Aqua MODIS L3m NetCDF files
OUTPUT_DIR = "processed_data"  # Processed CSV output

# --- Bounding Box: Louisiana Shellfish Harvest Areas ---
LON_MIN, LAT_MIN = -95.575013, 28.425522
LON_MAX, LAT_MAX = -88.630602, 30.711210


def kelvin_to_celsius(series):
    """Convert SST from Kelvin to Celsius if values indicate Kelvin scale."""
    if series.mean(skipna=True) > 200:
        return series - 273.15
    return series


def extract_date_from_filename(filename):
    """
    Attempt to extract an ISO date string from a standard Aqua MODIS
    L3m filename (e.g., AQUA_MODIS.20220115.L3m.DAY.SST.sst.4km.nc).
    Returns None if no date pattern is found.
    """
    import re
    match = re.search(r"(\d{8})", filename)
    if match:
        raw = match.group(1)
        try:
            from datetime import datetime
            return datetime.strptime(raw, "%Y%m%d").strftime("%Y-%m-%d")
        except ValueError:
            return None
    return None


def process_file(nc_path):
    """
    Reads a single Aqua MODIS L3m NetCDF file, clips to the Louisiana
    bounding box, and writes the result to CSV in OUTPUT_DIR.

    Cloud-masked pixels (NaN) are preserved — no imputation is applied.

    Returns True on success, False otherwise.
    """
    filename = os.path.basename(nc_path)
    csv_name = filename.replace(".nc", ".csv")
    output_path = os.path.join(OUTPUT_DIR, csv_name)

    print(f"Processing {filename}...")

    ds = None
    try:
        ds = xr.open_dataset(nc_path)

        # Aqua MODIS L3m SST variable name
        sst_var = "sst"
        if sst_var not in ds:
            # Some products use 'sea_surface_temperature' or 'SST'
            candidates = [v for v in ds.data_vars if "sst" in v.lower()]
            if not candidates:
                print(f"  Skipping {filename}: no SST variable found.")
                return False
            sst_var = candidates[0]

        # Extract coordinate arrays
        lat = ds["lat"].values.flatten() if "lat" in ds.coords else ds["latitude"].values.flatten()
        lon = ds["lon"].values.flatten() if "lon" in ds.coords else ds["longitude"].values.flatten()
        sst = ds[sst_var].values.flatten().astype(float)

        # Apply fill value masking
        fill_val = ds[sst_var].attrs.get("_FillValue", None)
        if fill_val is not None:
            sst = np.where(sst == fill_val, np.nan, sst)

        df = pd.DataFrame({"lat": lat, "lon": lon, "sst": sst})

        # Convert to Celsius if needed
        df["sst"] = kelvin_to_celsius(df["sst"])

        # Clip to Louisiana bounding box
        df = df[
            (df["lat"] >= LAT_MIN) & (df["lat"] <= LAT_MAX) &
            (df["lon"] >= LON_MIN) & (df["lon"] <= LON_MAX)
        ].copy()

        # Drop rows with no SST value (land pixels or sensor gaps)
        df = df.dropna(subset=["sst"])

        if df.empty:
            print(f"  No data within bounding box for {filename}.")
            return False

        # Attach date metadata if recoverable from filename
        date_str = extract_date_from_filename(filename)
        if date_str:
            df["date"] = date_str
        elif "time" in ds:
            time_vals = ds["time"].values
            df["date"] = str(time_vals[0] if time_vals.size > 1 else time_vals)

        df.to_csv(output_path, index=False)
        print(f"  Saved {output_path} ({len(df):,} records)")
        return True

    except Exception as e:
        print(f"  Error processing {filename}: {e}")
        return False

    finally:
        if ds is not None:
            ds.close()


def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    nc_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.nc")))

    if not nc_files:
        print(f"No .nc files found in '{INPUT_DIR}'. "
              f"Run download_ocean_nasa.py first or point INPUT_DIR to your NetCDF folder.")
        return

    print(f"Found {len(nc_files)} Aqua MODIS L3m file(s). Processing...\n")

    success = sum(process_file(f) for f in nc_files)

    print(f"\nDone. {success}/{len(nc_files)} files converted to CSV in '{OUTPUT_DIR}/'.")


if __name__ == "__main__":
    main()